# Runtime environment shared by everything launched from this script.
# NOTE: the project-root variable is spelled PORJECT_PATH; it is exported and
# referenced under that spelling further down in this file, so the name is
# kept exactly as is.
VLLM_ATTENTION_BACKEND=XFORMERS
PORJECT_PATH=/llm/nankai/xuyang_space/project/r1_infra/template_r1
export VLLM_ATTENTION_BACKEND PORJECT_PATH

# export WANDB_MODE=offline
# Temporary-file directory handed to child processes.
TMPDIR=/home/xuyang/ray_tmp
export TMPDIR

# GPU selection: only device index 1 is made visible to CUDA.
# ROLLOUT_TP_SIZE and N_GPUS are presumably read by the launched training
# script -- confirm there before changing them independently.
ROLLOUT_TP_SIZE=1
CUDA_VISIBLE_DEVICES=1
N_GPUS=1
export ROLLOUT_TP_SIZE CUDA_VISIBLE_DEVICES N_GPUS

# export ROLLOUT_TP_SIZE=1
# export CUDA_VISIBLE_DEVICES=2
# export N_GPUS=1

# Model selection: BASE_MODEL is the directory MODEL_ROOT joined with the
# model folder name MODEL_NAME.
MODEL_ROOT=/llm/nankai/xuyang_space/LLMs
MODEL_NAME=Qwen2.5-3B
# MODEL_NAME=Qwen2.5-7B-Instruct
export MODEL_ROOT MODEL_NAME

# Full path of the model to start from.
BASE_MODEL="${MODEL_ROOT}/${MODEL_NAME}"
export BASE_MODEL
# export BASE_MODEL=/llm/nankai/xuyang_space/project/LLaMA-Factory/saves/Qwen2.5-7B-agent_cdm_sft_0624/full/sft/checkpoint-247

# export BASE_MODEL=/llm/nankai/xuyang_space/LLMs/Qwen2.5-3B

# export BASE_MODEL=/llm/HFModels/Qwen2.5-7B-Instruct
# export BASE_MODEL=/llm/HFModels/Llama-3-8B-Instruct
# export BASE_MODEL=/llm/nankai/xuyang_space/LLMs/Qwen2.5-1.5B-Instruct
# export BASE_MODEL=/llm/nankai/xuyang_space/LLMs/Qwen2.5-1.5B
# export BASE_MODEL=/llm/HFModels/Qwen2.5-3B-Instruct
# export BASE_MODEL=/llm/nankai/xuyang_space/project/LLaMA-Factory/saves/Qwen2.5-3B/full/sft/checkpoint-500
# export BASE_MODEL=/llm/nankai/xuyang_space/project/LLaMA-Factory/saves/Qwen2.5-3B-all_right/full/sft/checkpoint-500
# export BASE_MODEL=/llm/nankai/xuyang_space/project/LLaMA-Factory/saves/Qwen2.5-3B-all_right-0610/full/sft/checkpoint-500

# export BASE_MODEL=/llm/nankai/xuyang_space/project/LLaMA-Factory/saves/Qwen2.5-3B-all_right-0612/full/sft
# export BASE_MODEL=/llm/nankai/xuyang_space/project/LLaMA-Factory/saves/Qwen2.5-3B-all_right-0613/full/sft/checkpoint-33

# Dataset location. DATA_DIR currently points straight at DATA_ROOT; the
# per-dataset layout is kept below as a commented-out alternative.
DATA_ROOT=/llm/nankai/xuyang_space/data
# export DATA_NAME=dapo
# export DATA_NAME=grpo
# export DATA_DIR=$DATA_ROOT/$DATA_NAME/data
DATA_DIR="${DATA_ROOT}"
export DATA_ROOT DATA_DIR

# set reward function
# export REWARD_FUNCTION=agent_cdm_14
# export CUSTOM_REWARD_FUNCTION=$PORJECT_PATH/reward_score/$REWARD_FUNCTION.py


# Experiment configuration: project/log location, optimisation settings,
# resume policy, algorithm choice, and the derived run name and output
# directory. Everything is exported so the launched script can read it.
export PROJECT_NAME=template_r1
export LOG_PATH="${PORJECT_PATH}"

# export LEARNING_RATE=3e-6
export LEARNING_RATE=3e-6
export ROLLOUT_N=5
export BATCH_SIZE=32

export DATA_SHUFFLE=False


export MAX_RESPONSE_LENGTH=1024


# To continue from a specific checkpoint, enable the two lines below instead
# of the null/auto pair that follows.
# export RESUME_PATH=/llm/nankai/xuyang_space/project/r1_infra/verl/checkpoints/AgentCDM/Qwen2.5-7B-grpo-agent_cdm_13-data20250701-2048-3e-6-256-5-auto-False-0701/global_step_20
# export RESUME_MODE=resume_path

export RESUME_PATH=null
export RESUME_MODE=auto

# export algorithm=dapo
export algorithm=grpo
# export algorithm=rf


# Date tag appended to the run name (the exported name TODAY_DATA is kept
# unchanged because other scripts may read it).
export TODAY_DATA=0705

# Run name: the settings above joined with hyphens.
# DATA_NAME is only assigned when one of the commented-out
# `export DATA_NAME=...` lines is enabled (or it is inherited from the calling
# environment). Its segment is therefore added only when DATA_NAME is
# non-empty; interpolating it unconditionally left an empty "--" segment in
# both EXPERIMENT_NAME and OUTPUT_DIR.
# NOTE(review): runs started before this fix with an empty DATA_NAME used a
# name containing "--"; rename those directories if they must be picked up
# again under the new name.
export EXPERIMENT_NAME="${MODEL_NAME}-${algorithm}${DATA_NAME:+-${DATA_NAME}}-${MAX_RESPONSE_LENGTH}-${LEARNING_RATE}-${BATCH_SIZE}-${ROLLOUT_N}-${RESUME_MODE}-${DATA_SHUFFLE}-${TODAY_DATA}"
export OUTPUT_DIR="${PORJECT_PATH}/outputs/${EXPERIMENT_NAME}"


# Launch training with the environment exported above.
# The launcher is resolved through PORJECT_PATH rather than a second
# hard-coded copy of the project root, so it always comes from the same tree
# that LOG_PATH and OUTPUT_DIR point into. The ":?" guard aborts with a clear
# message instead of running "/scripts/..." if the variable is empty or unset.
# Alternative launcher:
# bash "${PORJECT_PATH}/scripts/run_qwen2_5-3b_gsm8k_grpo_lora.sh"
bash "${PORJECT_PATH:?PORJECT_PATH must be set}/scripts/qwen2-3b_grpo.sh"